/* This file takes industry level employment and wage variables for each county,
maps them to the correct industry using the subsidy data, and then fills this so 
that we can compare employment and wages in every county, even 0s. Then, the file does the
same for the commuting zone. 
* Before running this file, you need to run build_cbp, in order to create the 
necessary .dta files of employment and wages at the county-naics level. 
**********************************************************************************/

* start with the naics4, then fillin so we have all possible fips, then  merge in naics3 and naics2
*************************************************************************
u "$datadir/CBP_county_ind", clear
egen id = group(fipscty fips)
fillin id naics4 year
gsort id -fipscty -fips

by id: replace fips=fips[1] if _fillin==1
by id: replace fipscty=fipscty[1] if _fillin==1
replace est=0 if _fillin==1
drop _fillin 

g avpay_n4 = ap/emp
drop ap emp est1_9-est1000_4 empflag

bys id year: egen tot_est=sum(est)
g perc_est_n4 = est/tot_est

merge m:1 naics4 year using $datadir/sample_year_naics, keep(3) nogen //select correct sample 

replace naics2=31 if inlist(naics2,32,33)
replace naics2=44 if naics2==45
replace naics2=48 if naics2==49

* merge in the naics3 and naics2 level measures 
merge m:1 naics3 year fips fipscty using "$datadir/CBP_county_ind_n3", keep(1 3) ///
	keepusing(ap emp_n3) nogen 
g avpay_n3 = ap/emp_n3
drop ap emp

merge m:1 naics2 year fips fipscty using "$datadir/CBP_county_ind_n2", keep(1 3) ///
	keepusing(ap emp_n2) nogen 
g avpay_n2 = ap/emp_n2
drop ap emp

replace year=year+1 // now we are in year of subsidy deal 
rename est est_n4 

rename * *_rect
rename year year
rename naics4 naics4 

label var est "number of establishments in county (naics4)"
label var avpay_n2 "average wages in county (naics2)"
label var avpay_n3 "average wages in county (naics3)"
label var avpay_n4 "average wages in county (naics4)"
label var perc_est_n4 "percent of establishments in county in naics4"

drop naics3 naics2 
	
levelsof naics4, local(ind)

g raw_est_n4 = est_n4

foreach v in `ind' {
	qui summ est_n4 if naics4==`v' 
	replace est_n4 = (est_n4 - r(mean))/r(sd) if naics4==`v'
	}

g wage = avpay_n4
replace wage = avpay_n3 if wage==.

g ln_wage = log(wage)

rename *_rect *
rename fips statefip
keep statefip year naics4 fipscty est_n4 raw_est_n4 wage ln_wage perc_est_n4
sa "$datadir/county_naics_rect", replace

***********************************************************************************
**** change the level of aggregation in CBP_ files ****
u "$datadir/CBP_county_ind", clear

ren fips statefip
merge m:1 statefip fipscty using "$temp/cty to cz", keep(1 3)nogen
drop if mi(commutingzoneid2000)
drop fips
ren statefip fips
save "$temp/with cz", replace

forval i = 2/4{
	u "$temp/with cz", clear
	
	foreach v of varlist emp ap est{
		replace `v' = 0 if mi(`v')
	}
	collapse (rawsum)emp ap est, by(fips year naics`i' commutingzoneid2000)
	save "$temp/CBP_cz_ind`i'", replace
}

* repeat the above to create the CZ level file. 
***********************************************************************
u "$temp/CBP_cz_ind4", clear
egen cz_id = group(commutingzoneid2000 fips)
fillin cz_id naics4 year
gsort cz_id -commutingzoneid2000 -fips

by cz_id: replace fips=fips[1] if _fillin==1
by cz_id: replace commutingzoneid2000=commutingzoneid2000[1] if _fillin==1
replace est=0 if _fillin==1
drop _fillin 
g avpay_n4 = ap/emp
drop ap emp

bys cz_id year: egen tot_est=sum(est)
g perc_est_n4 = est/tot_est

merge m:1 naics4 year using $datadir/sample_year_naics, keep(3) nogen

gen naics3=floor(naics4/10)
gen naics2=floor(naics3/10)
replace naics2=31 if inlist(naics2,32,33)
replace naics2=44 if naics2==45
replace naics2=48 if naics2==49

merge m:1 naics3 year fips commutingzoneid2000 using "$temp/CBP_cz_ind3", keep(1 3) nogen 
g avpay_n3 = ap/emp
drop ap emp

merge m:1 naics2 year fips commutingzoneid2000 using "$temp/CBP_cz_ind2", keep(1 3) nogen 
g avpay_n2 = ap/emp
drop ap emp

replace year=year+1
rename est est_n4 

rename * *_rect
rename year year
rename naics4 naics4 

label var est "number of establishments in commuting zone (naics4)"
label var avpay_n2 "average wages in commuting zone (naics2)"
label var avpay_n3 "average wages in commuting zone (naics3)"
label var avpay_n4 "average wages in commuting zone (naics4)"
label var perc_est_n4 "percent of establishments in cz in naics4"

drop naics3 naics2 
	
g wage = avpay_n4
replace wage = avpay_n3 if wage==.
replace wage = avpay_n2 if wage==. 

g ln_wage = log(wage)

rename *_rect *
rename fips statefip
keep statefip year naics4 commutingzone wage ln_wage perc_est_n4 est_n4

foreach v in perc_est_n4 wage ln_wage {
		winsor2 `v', cut(1 99) replace
}

sa $datadir/cz_naics_rect, replace

*****************************************************************************
*combine both to make a wage specific file, will use for estimating non-subsidy tax credit/grants:
*****************************************************************************
u $datadir/county_naics_rect, clear
keep statefip year fipscty naics4 wage
rename wage wage_county
merge m:1 fipscty statefip using $datadir/cty2cz, nogen
merge m:1 statefip commutingzoneid naics4 year using $datadir/CZ_naics_rect, nogen ///
	keep(1 3) keepusing(wage commutingzone)
rename wage wage_cz

order statefip fipscty fips commuting county year naics4 wage*
duplicates tag countyname year naics4 statefip, gen(dup)
drop if dup>0
drop dup

sa $datadir/wage_naics_rect, replace
